---
title: "kenney_salchak_appendix"
author: "Maggie Kenney"
date: "2024-12-12"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(ggplot2)
library(broom)
library(SentimentAnalysis)
library(estimatr)
library(stats)
library(modelsummary)
library(purrr)
library(tidyverse)
library(magrittr)
library(sandwich)
library(coefplot)
library(kableExtra)
library(knitr)
library(xtable)
```

```{r}
# Load the three analysis samples.
#
# The files are located through `data_dir` instead of `setwd()`: knitr restores
# the working directory at the end of every chunk, so a `setwd()` call here
# never applied to the reads and writes in later chunks, and the placeholder
# path stopped the document from knitting at all.
# Point `data_dir` at the folder holding the CSV files; "." is the folder that
# contains this document.
data_dir <- "."

firm_sample <- read.csv(file.path(data_dir, "firm_sample.csv"))
oecd_sample <- read.csv(file.path(data_dir, "oecd_sample.csv"))
full_sample <- read.csv(file.path(data_dir, "kenney_salchak_audit.csv"))
```

# First and Last Name Balance
# Table 3 and 4
```{r}
# Read the first-name and last-name rating files.
first_names <- read.csv("comb_traitratings.csv")
last_names <- read.csv("lastname_traitratings.csv")

# Table 3: keep the rows for the two first names and transpose them, so that
# each selected name becomes a column of the exported table.
first_names_tab <- t(filter(first_names, name %in% c("Mary", "Jake")))
xtab_first <- xtable(first_names_tab)
print(xtab_first, file = "table3.tex")

# Table 4: same layout for the two last names.
last_names_tab <- t(filter(last_names, name %in% c("Miller", "Williams")))
xtab_last <- xtable(last_names_tab)
print(xtab_last, file = "table4.tex")
```

# Robustness Checks
# Table 7: Probit Model
```{r}
# Analysis sample: only rows whose email was delivered.
full <- filter(full_sample, delivered == 1)

# Probit regressions of each outcome on the treatment indicator.
probit_family <- binomial(link = "probit")
interview_probit <- glm(scheduled ~ treat, family = probit_family, data = full)
response_probit <- glm(respond ~ treat, family = probit_family, data = full)

probit <- list(
  "Interview Scheduled" = interview_probit,
  "Email Response" = response_probit
)

# Display labels for coefficients and the goodness-of-fit rows to request.
# `gof` is reused by later tables.
cm <- c("(Intercept)" = "Constant", treat = "Jake Miller")
gof <- c("nobs", "r.squared")

modelsummary(
  probit,
  output = "table7.tex",
  coef_omit = "Intercept",
  caption = "Average Treatment Effect: Gender",
  coef_map = cm,
  gof_map = gof,
  fmt = 3,
  stars = c("*" = .1, "**" = 0.05, "***" = 0.01)
)
```

# Table 8: Difference in Proportions
```{r}
# Two-sample test that the share of scheduled interviews is equal across the
# two treatment arms (treat == 1 vs. treat == 0) in the delivered sample.

# Arm sizes and number of scheduled interviews per arm. Rows with a missing
# `treat` or `scheduled` value are not counted, as with dplyr::filter().
# Counts are kept as doubles so the exported table formats them as before.
jake_n <- as.numeric(sum(full$treat == 1, na.rm = TRUE))
success_jake_n <- as.numeric(
  sum(full$treat == 1 & full$scheduled == 1, na.rm = TRUE)
)

mary_n <- as.numeric(sum(full$treat == 0, na.rm = TRUE))
success_mary_n <- as.numeric(
  sum(full$treat == 0 & full$scheduled == 1, na.rm = TRUE)
)

success <- c(success_jake_n, success_mary_n)
sample_size <- c(jake_n, mary_n)

# No continuity correction, so the reported X-squared is exactly the square of
# the pooled two-proportion z statistic.
result <- prop.test(success, sample_size, correct = FALSE)

# prop.test() returns Pearson's X-squared, not z. For two groups the z
# statistic is its square root, signed by the direction of the difference
# (first arm minus second arm). unname() drops the "X-squared" name, which
# otherwise makes data.frame() warn about discarded row names.
prop_diff <- success[1] / sample_size[1] - success[2] / sample_size[2]
z_stat <- sign(prop_diff) * sqrt(unname(result$statistic))
p_value <- result$p.value
conf_int <- result$conf.int

# One row per arm; the test-level quantities are recycled onto both rows.
results_list <- data.frame(
  success = success,
  sample_size = sample_size,
  z_stat = z_stat,
  p_value = p_value,
  conf_low = conf_int[1],
  conf_high = conf_int[2]
)

xtab_diff_prop <- xtable(results_list, digits = 4)

print(xtab_diff_prop, file = "table8.tex")
```

# Exploratory Tests
# Figure 3: Reminder Required
```{r}
# Coefficient plot of the treatment effect on `time_reminder`, estimated with
# robust standard errors on the pooled sample and on each subsample.

# `firm` and `oecd` were referenced without ever being created; only
# `firm_sample` and `oecd_sample` are loaded. Build them here.
# NOTE(review): restricted to delivered emails to mirror how `full` is built
# from `full_sample` - confirm that both subsample files carry `delivered`.
firm <- firm_sample %>%
  filter(delivered == 1)
oecd <- oecd_sample %>%
  filter(delivered == 1)

reminder_full <- lm_robust(time_reminder ~ treat, data = full)
reminder_firm <- lm_robust(time_reminder ~ treat, data = firm)
reminder_oecd <- lm_robust(time_reminder ~ treat, data = oecd)

models_reminder <- list(
  "Pooled" = reminder_full,
  "Firm" = reminder_firm,
  "OECD" = reminder_oecd
)

# Inner/outer interval multipliers 1.645 and 1.96 (normal-approximation 90%
# and 95% intervals); the intercept is left out of the plot.
figure3 <- multiplot(
  models_reminder,
  xlab = "Change in Probability Reminder Email Sent",
  ylab = "Sample",
  innerCI = 1.645,
  outerCI = 1.96,
  lwdInner = 2,
  lwdOuter = 1,
  intercept = FALSE
) +
  theme_classic() +
  theme(
    axis.title.x = element_text(size = 20),
    axis.title.y = element_text(size = 20),
    axis.text.x = element_text(size = 20),
    legend.title = element_blank(),
    legend.text = element_text(size = 20),
    axis.text.y = element_blank(),
    plot.title = element_blank()
  )

# Print explicitly so the plot is drawn on the PDF device regardless of
# whether the chunk is knitted, sourced, or run line by line.
pdf("figure3.pdf")
print(figure3)
dev.off()
```

# Figure 4: Time until response
```{r}
# Coefficient plot of the treatment effect on `time_respond`, estimated with
# robust standard errors on the pooled sample and on each subsample.

# `firm` and `oecd` were referenced without ever being created; only
# `firm_sample` and `oecd_sample` are loaded. Build them here.
# NOTE(review): restricted to delivered emails to mirror how `full` is built
# from `full_sample` - confirm that both subsample files carry `delivered`.
firm <- firm_sample %>%
  filter(delivered == 1)
oecd <- oecd_sample %>%
  filter(delivered == 1)

time_full <- lm_robust(time_respond ~ treat, data = full)
time_firm <- lm_robust(time_respond ~ treat, data = firm)
time_oecd <- lm_robust(time_respond ~ treat, data = oecd)

models_time <- list(
  "Pooled" = time_full,
  "Firm" = time_firm,
  "OECD" = time_oecd
)

# Inner/outer interval multipliers 1.645 and 1.96 (normal-approximation 90%
# and 95% intervals); the intercept is left out of the plot.
figure4 <- multiplot(
  models_time,
  xlab = "Change in Time to Response",
  ylab = "Sample",
  innerCI = 1.645,
  outerCI = 1.96,
  lwdInner = 2,
  lwdOuter = 1,
  intercept = FALSE
) +
  theme_classic() +
  theme(
    axis.title.x = element_text(size = 20),
    axis.title.y = element_text(size = 20),
    axis.text.x = element_text(size = 20),
    legend.title = element_blank(),
    legend.text = element_text(size = 20),
    axis.text.y = element_blank(),
    plot.title = element_blank()
  )

# Print explicitly so the plot is drawn on the PDF device regardless of
# whether the chunk is knitted, sourced, or run line by line.
pdf("figure4.pdf")
print(figure4)
dev.off()
```

# Table 9: Sentiment Analysis
# This analysis is excluded for confidentiality: sharing the exact text publicly may allow for identification of subjects.
```{r}
# The whole chunk is intentionally commented out: it needs the raw response
# text, which is withheld. It is kept to document the procedure.
#
# NOTE(review): Corpus(), tm_map(), removeNumbers, stripWhitespace and
# stemDocument are not provided by any package attached in the setup chunk -
# presumably they come from the `tm` package; confirm before re-enabling.
# NOTE(review): the final modelsummary() call below writes to "table7.tex",
# the same file the probit table is written to, although this section is
# headed Table 9 - confirm the intended file name before re-enabling.

# Keep only rows with a response, ordered by text.
# full_text <- full %>%
#   arrange(text) %>%
#   filter(respond == 1)

# Build a corpus from the response text and normalise it: lower-case, remove
# numbers, collapse whitespace, stem.
# text_doc <- Corpus(VectorSource(full_text$text))
# text_doc <- tm_map(text_doc, content_transformer(tolower))
# text_doc <- tm_map(text_doc, removeNumbers)
# text_doc <- tm_map(text_doc, stripWhitespace)
# text_doc <- tm_map(text_doc, stemDocument)

# Take the SentimentGI score returned by analyzeSentiment() for each document.
# sentiment <- analyzeSentiment(text_doc)$SentimentGI

# Helper: number of whitespace-separated words in a text.
# count_words <- function(text) {
#   # Split text into words
#   words <- unlist(strsplit(text, "\\s+"))
#   # Count the number of words
#   num_words <- length(words)
#   return(num_words)
# }

# word_lengths <- sapply(text_doc, count_words)

# Attach the sentiment scores to the response rows (same ordering by text).
# full_text <- full_text %>%
#   arrange(text) %>%
#   mutate(sentiment)

# Regress sentiment on the treatment indicator and export the table.
# sentiment_lm <- lm(sentiment ~ treat, data = full_text)

# cm_sentiment <- c('(Intercept)' = 'Constant', treat = "Jake Miller")

# modelsummary(sentiment_lm, output = "table7.tex", coef_omit = "Intercept", caption = "Sentiment Analysis", coef_map = cm_sentiment, gof_map = gof, fmt = 3, stars = c('*' = .1, '**' = 0.05, '***' = 0.01))
```

# Table 10: More likely to question Mary, vs Jake
```{r}
# Table 10: regress the `question` indicator on treatment among rows with an
# email response, using robust standard errors.
full_text <- full %>%
  filter(respond == 1)

# The fitted model gets its own name so it is not confused with the `question`
# column it is estimated on.
question_lm <- lm_robust(question ~ treat, data = full_text)

cm_question <- c("(Intercept)" = "Constant", treat = "Jake Miller")

# Use the coefficient map defined in this chunk. The previous reference,
# `cm_sentiment`, is only defined inside the commented-out sentiment chunk, so
# the call failed with "object not found".
modelsummary(
  question_lm,
  output = "table10.tex",
  coef_omit = "Intercept",
  caption = "More questions in email responses",
  coef_map = cm_question,
  gof_map = gof,
  fmt = 3,
  stars = c("*" = .1, "**" = 0.05, "***" = 0.01)
)
```

# Table 11 and 12: Balance Test - Email Delivery
```{r}
# Table 11: number of undelivered emails in each treat-by-gender cell.
# count() returns an ungrouped table, so no grouping leaks into later steps
# and summarise() no longer prints its regrouping message.
full_dropped <- full_sample %>%
  filter(delivered == 0) %>%
  count(treat, gender, name = "count")

knitr::kable(
  full_dropped,
  format = "latex",
  caption = "Balance Test: Email Delivery",
  digits = 1
) %>%
  kable_styling() %>%
  save_kable(file = "table11.tex")

# Table 12: linear models of delivery on gender and on treatment.
# The fitted models get their own names; calling them `gender` and `treat`
# overwrote those names in the global environment with model objects, leaving
# every later `treat == ...` expression dependent on data masking alone.
delivered_by_gender <- lm(delivered ~ gender, data = full_sample)
delivered_by_treat <- lm(delivered ~ treat, data = full_sample)

mods_balance <- list(delivered_by_gender, delivered_by_treat)

cm_balance <- c("gender" = "Elite Gender", "treat" = "Jake Miller")

modelsummary(
  mods_balance,
  output = "table12.tex",
  coef_omit = "Intercept",
  caption = "Email Delivery Balance",
  coef_map = cm_balance,
  gof_map = gof,
  fmt = 3,
  stars = c("*" = .1, "**" = 0.05, "***" = 0.01)
)
```
# Table 13: Differential Attrition after Email Responses
```{r}
# Table 13: number of delivered emails that received a response containing a
# question, split by treatment arm.
# NOTE(review): this table was described as "responded with a question, but
# did not schedule an interview", yet no condition on `scheduled` is applied
# below - confirm which definition is intended.
respond_question <- filter(full, delivered == 1, respond == 1, question == 1)

# Integer counts per arm; rows with a missing `treat` are not counted.
questions_jake_n <- sum(respond_question$treat == 1, na.rm = TRUE)
questions_mary_n <- sum(respond_question$treat == 0, na.rm = TRUE)

# Two-row, one-column matrix with the arm labels as row names.
question_counts <- rbind(
  "Jake Miller" = questions_jake_n,
  "Mary Williams" = questions_mary_n
)

xtab_13 <- xtable(question_counts)

print(xtab_13, file = "table13.tex")
```


